import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
pd.options.plotting.backend = 'plotly'
from dsc80_utils import * # Feel free to uncomment and use this.
Step 1: Introduction¶
# TODO
# Load the two raw datasets: per-user reviews (interactions) and per-recipe metadata.
interactions_df = pd.read_csv('interactions.csv')
raw_recipes_df = pd.read_csv('RAW_recipes.csv')
# Inspect reviews with a rating of 0 — these read as comments/questions, not true ratings.
interactions_df[interactions_df['rating'] == 0]['review']
3 Just an observation, so I will not rate. I fo...
5 Made my own buttermilk w/ vinegar and milk. U...
10 This is a very good recipe. We also want to c...
...
731888 Delicious ! I tweeked the recipe a bit>>substi...
731893 Just added this mix to a homemade beef & s...
731895 Would this make a good hamburger patty seasoning?
Name: review, Length: 51832, dtype: object
raw_recipes_df
| name | id | minutes | contributor_id | ... | steps | description | ingredients | n_ingredients | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | ... | ['heat the oven to 350f and arrange the rack i... | these are the most; chocolatey, moist, rich, d... | ['bittersweet chocolate', 'unsalted butter', '... | 9 |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | ... | ['pre-heat oven the 350 degrees f', 'in a mixi... | this is the recipe that we use at my school ca... | ['white sugar', 'brown sugar', 'salt', 'margar... | 11 |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | ... | ['preheat oven to 350 degrees', 'spray a 2 qua... | since there are already 411 recipes for brocco... | ['frozen broccoli cuts', 'cream of chicken sou... | 9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 83779 | zydeco ya ya deviled eggs | 308080 | 40 | 37779 | ... | ['in a bowl , combine the mashed yolks and may... | deviled eggs, cajun-style | ['hard-cooked eggs', 'mayonnaise', 'dijon must... | 8 |
| 83780 | cookies by design cookies on a stick | 298512 | 29 | 506822 | ... | ['place melted butter in a large mixing bowl a... | i've heard of the 'cookies by design' company,... | ['butter', 'eagle brand condensed milk', 'ligh... | 10 |
| 83781 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | ['whip sugar and shortening in a large bowl , ... | i've heard of the 'cookies by design' company,... | ['granulated sugar', 'shortening', 'eggs', 'fl... | 7 |
83782 rows × 12 columns
# Left-merge so every recipe is kept even if it has no reviews.
recipes_full_df = pd.merge(raw_recipes_df, interactions_df, how='left', left_on='id', right_on='recipe_id')
# A rating of 0 means "no rating given" on the site, so treat it as missing.
recipes_full_df['rating'] = recipes_full_df['rating'].replace(0, np.nan)
# Per-recipe mean rating (NaNs are ignored by .mean()).
average_rating_per_recipe = recipes_full_df.groupby('recipe_id')['rating'].mean()
average_rating_per_recipe_df = average_rating_per_recipe.reset_index(name='avg_rating')
# Attach the per-recipe average back onto every (recipe, review) row.
recipes_df = recipes_full_df.merge(average_rating_per_recipe_df, on='recipe_id', how='left')
recipes_df
| name | id | minutes | contributor_id | ... | date | rating | review | avg_rating | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | ... | 2008-11-19 | 4.0 | These were pretty good, but took forever to ba... | 4.0 |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | ... | 2012-01-26 | 5.0 | Originally I was gonna cut the recipe in half ... | 5.0 |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | ... | 2008-12-31 | 5.0 | This was one of the best broccoli casseroles t... | 5.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234426 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 2008-06-19 | 1.0 | This recipe tastes nothing like the Cookies by... | 3.0 |
| 234427 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 2010-02-08 | 5.0 | yummy cookies, i love this recipe me and my sm... | 3.0 |
| 234428 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 2014-11-01 | NaN | I work at a Cookies By Design and can say this... | 3.0 |
234429 rows × 18 columns
recipes_df['date']
0 2008-11-19
1 2012-01-26
2 2008-12-31
...
234426 2008-06-19
234427 2010-02-08
234428 2014-11-01
Name: date, Length: 234429, dtype: object
Step 2: Data Cleaning and Exploratory Data Analysis¶
# Prototype the parse on one row: 'nutrition' is a stringified list of 7 numbers.
tester_nutrition_line = raw_recipes_df['nutrition'][0]
tester_nutrition_line.strip('[').strip(']').split(',')
['138.4', ' 10.0', ' 50.0', ' 3.0', ' 3.0', ' 19.0', ' 6.0']
# Expand the stringified nutrition list into one numeric column per nutrient.
_nutrient_cols = ['calories (#)', 'total fat (PDV)', 'sugar (PDV)', 'sodium (PDV)',
                  'protein (PDV)', 'saturated fat (PDV)', 'carbohydrates (PDV)']
_parts = recipes_df['nutrition'].str.strip('[]').str.split(',', expand=True)
_parts.columns = _nutrient_cols
# Any non-numeric fragment becomes NaN instead of raising.
_parts = _parts.apply(pd.to_numeric, errors='coerce')
recipes_df = pd.concat([recipes_df, _parts], axis=1).drop(columns=['nutrition'])
recipes_df
| name | id | minutes | contributor_id | ... | sodium (PDV) | protein (PDV) | saturated fat (PDV) | carbohydrates (PDV) | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | ... | 3.0 | 3.0 | 19.0 | 6.0 |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | ... | 22.0 | 13.0 | 51.0 | 26.0 |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | ... | 32.0 | 22.0 | 36.0 | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234426 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 4.0 | 4.0 | 11.0 | 6.0 |
| 234427 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 4.0 | 4.0 | 11.0 | 6.0 |
| 234428 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 4.0 | 4.0 | 11.0 | 6.0 |
234429 rows × 24 columns
# ingredients, steps, and tags columns are strings that look like lists, changing them to be lists
# NOTE(review): this naive strip/replace/split will mis-parse items that themselves
# contain commas or apostrophes (e.g. "i've heard...") — acceptable here, but confirm
# if per-item fidelity ever matters.
recipes_cleaned_df = recipes_df.assign(
    steps=recipes_df['steps'].str.strip('[]').str.replace("'", "").str.split(', '),
    ingredients=recipes_df['ingredients'].str.strip('[]').str.replace("'", "").str.split(', '),
    tags=recipes_df['tags'].str.strip('[]').str.replace("'", "").str.split(', ')
)
recipes_cleaned_df
| name | id | minutes | contributor_id | ... | sodium (PDV) | protein (PDV) | saturated fat (PDV) | carbohydrates (PDV) | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | ... | 3.0 | 3.0 | 19.0 | 6.0 |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | ... | 22.0 | 13.0 | 51.0 | 26.0 |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | ... | 32.0 | 22.0 | 36.0 | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234426 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 4.0 | 4.0 | 11.0 | 6.0 |
| 234427 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 4.0 | 4.0 | 11.0 | 6.0 |
| 234428 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 4.0 | 4.0 | 11.0 | 6.0 |
234429 rows × 24 columns
# Change the 'date' and 'submitted' columns to datetime.
# Fix: 'submitted' (from RAW_recipes) is the date the RECIPE was posted, while
# 'date' (from interactions) is the date the REVIEW was submitted — the original
# assignments had these two labels swapped (visible in the output: the column
# named date_review_submitted was constant across all reviews of one recipe).
recipes_cleaned_df['date_review_submitted'] = pd.to_datetime(recipes_cleaned_df['date'])
recipes_cleaned_df['date_recipe_posted'] = pd.to_datetime(recipes_cleaned_df['submitted'])
recipes_cleaned_df = recipes_cleaned_df.drop(columns=['date', 'submitted'])
recipes_cleaned_df
| name | id | minutes | contributor_id | ... | saturated fat (PDV) | carbohydrates (PDV) | date_review_submitted | date_recipe_posted | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | ... | 19.0 | 6.0 | 2008-10-27 | 2008-11-19 |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | ... | 51.0 | 26.0 | 2011-04-11 | 2012-01-26 |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | ... | 36.0 | 3.0 | 2008-05-30 | 2008-12-31 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234426 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 11.0 | 6.0 | 2008-04-15 | 2008-06-19 |
| 234427 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 11.0 | 6.0 | 2008-04-15 | 2010-02-08 |
| 234428 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 11.0 | 6.0 | 2008-04-15 | 2014-11-01 |
234429 rows × 24 columns
# Add grams/milligrams columns for the nutrients given in PDV. The FDA daily
# values changed in 2016, so the conversion factor depends on the year the
# recipe was posted.
_year = recipes_cleaned_df['date_recipe_posted'].dt.year
_dv_by_era = [
    (_year < 2016, {'total fat': 65, 'sugar': 50, 'sodium': 2400,
                    'protein': 50, 'saturated fat': 20, 'carbohydrates': 300}),
    (_year >= 2016, {'total fat': 78, 'sugar': 50, 'sodium': 2300,
                     'protein': 50, 'saturated fat': 20, 'carbohydrates': 275}),
]
for _mask, _daily_values in _dv_by_era:
    for _nutrient, _dv in _daily_values.items():
        # Sodium is reported in milligrams; everything else in grams.
        _unit = 'mg' if _nutrient == 'sodium' else 'g'
        recipes_cleaned_df.loc[_mask, f'{_nutrient} ({_unit})'] = (
            recipes_cleaned_df.loc[_mask, f'{_nutrient} (PDV)'] * _dv / 100
        )
recipes_cleaned_df
| name | id | minutes | contributor_id | ... | sodium (mg) | protein (g) | saturated fat (g) | carbohydrates (g) | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | ... | 72.0 | 1.5 | 3.8 | 18.0 |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | ... | 528.0 | 6.5 | 10.2 | 78.0 |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | ... | 768.0 | 11.0 | 7.2 | 9.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234426 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 96.0 | 2.0 | 2.2 | 18.0 |
| 234427 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 96.0 | 2.0 | 2.2 | 18.0 |
| 234428 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 96.0 | 2.0 | 2.2 | 18.0 |
234429 rows × 30 columns
# Distribution of individual review ratings.
fig = px.histogram(recipes_cleaned_df, x='rating', nbins=50, title='Distribution of Ratings')
fig.show()
# Trend of per-recipe average rating by review-submission year.
recipes_cleaned_df['year'] = recipes_cleaned_df['date_review_submitted'].dt.year
grouped = recipes_cleaned_df.groupby('year')['avg_rating'].mean().reset_index()
fig = px.line(grouped, x='year', y='avg_rating', title='Average Rating Over Time')
fig.show()
def clean_tags(val):
    """Normalize a tags entry to a plain list of tag strings.

    Lists pass through untouched; stringified lists like "['a', 'b']" are
    parsed naively; anything else (e.g. NaN) becomes an empty list.
    """
    if isinstance(val, str):
        return val.strip("[]").replace("'", "").split(", ")
    return val if isinstance(val, list) else []
# Re-normalize tags: earlier cleaning may have left strings or NaN behind.
recipes_cleaned_df['tags'] = recipes_cleaned_df['tags'].apply(clean_tags)
# Tag / name keywords that indicate a baked good.
# Fix: the original list was missing a comma between 'cookies' and 'bread',
# which silently string-concatenated them into the single useless entry
# 'cookiesbread', so neither keyword matched. Duplicates were also removed.
baked_keywords = [
    'baking', 'baked',
    'bread', 'breads', 'bread-machine', 'quick-breads', 'bread-pudding',
    'cake', 'cakes', 'cake-fillings-and-frostings', 'cheesecake',
    'cupcake', 'cupcakes',
    'cookie', 'cookies', 'cookies-and-brownies', 'drop-cookies',
    'rolled-cookies', 'bar-cookies',
    'brownie', 'brownies', 'biscotti', 'biscuit',
    'muffin', 'muffins',
    'pie', 'pies', 'pies-and-tarts', 'tart', 'tarts', 'pastry',
    'scones', 'quiche',
    'puddings-and-mousses', 'yeast', 'flat-shapes', 'crusts-pastry-dough-2',
    'fillings-and-frostings-chocolate',
    'dessert', 'desserts', 'desserts-easy', 'desserts-fruit',
    'halloween-cakes', 'halloween-cupcakes',
]
# recipes_cleaned_df['is_baked_good'] = recipes_cleaned_df['tags'].apply(
# lambda tags: any(isinstance(tag, str) and tag.lower() in baked_keywords for tag in tags)
# )
def is_baked(tags, name, keywords=None):
    """Return True if a recipe's tags or name indicate a baked good.

    Parameters
    ----------
    tags : list
        Cleaned tag list; non-string entries are ignored.
    name : str or NaN
        Recipe name; non-string values (e.g. NaN) never match.
    keywords : iterable of str, optional
        Keywords to match against. Defaults to the module-level
        ``baked_keywords`` list (new parameter is backward-compatible).
    """
    if keywords is None:
        keywords = baked_keywords
    tag_match = any(tag.lower() in keywords for tag in tags if isinstance(tag, str))
    # Fix: the original placed the `isinstance(name, str)` guard inside the
    # generator's filter clause, which worked only by accident of evaluation
    # order; guard the name once, up front.
    name_match = isinstance(name, str) and any(kw in name.lower() for kw in keywords)
    return tag_match or name_match
# Label each row as a baked good using both its tags and its name.
recipes_cleaned_df['is_baked_good'] = recipes_cleaned_df.apply(
    lambda row: is_baked(row['tags'], row['name']),
    axis=1
)
recipes_cleaned_df
| name | id | minutes | contributor_id | ... | saturated fat (g) | carbohydrates (g) | year | is_baked_good | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | ... | 3.8 | 18.0 | 2008 | True |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | ... | 10.2 | 78.0 | 2011 | True |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | ... | 7.2 | 9.0 | 2008 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 234426 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 2.2 | 18.0 | 2008 | True |
| 234427 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 2.2 | 18.0 | 2008 | True |
| 234428 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | ... | 2.2 | 18.0 | 2008 | True |
234429 rows × 32 columns
# Filter out outliers for clarity.
# NOTE(review): these cutoffs look eyeballed rather than derived (e.g. IQR);
# confirm they don't systematically drop one class of recipes.
filtered_df = recipes_cleaned_df[
    (recipes_cleaned_df['sugar (g)'] < 300) &
    (recipes_cleaned_df['protein (g)'] < 100) &
    (recipes_cleaned_df['saturated fat (g)'] < 100) &
    (recipes_cleaned_df['carbohydrates (g)'] < 300) &
    (recipes_cleaned_df['sodium (mg)'] < 4000) &
    (recipes_cleaned_df['calories (#)'] < 2000)
]
sample_df = filtered_df.sample(500, random_state=1)
# Univariate Analysis: marginal distributions of sugar and calories.
fig1 = px.histogram(filtered_df, x='sugar (g)', nbins=50, title='Distribution of Sugar')
fig1.show()
fig2 = px.histogram(filtered_df, x='calories (#)', nbins=50, title='Distribution of Calories')
fig2.show()
# Bivariate Analysis: nutrient distributions conditioned on the baked-good label.
fig3 = px.box(filtered_df, x='is_baked_good', y='sugar (g)',
              title='Sugar Content by Baked Good Label')
fig3.show()
fig4 = px.box(filtered_df, x='is_baked_good', y='protein (g)',
              title='Protein Content by Baked Good Label')
fig4.show()
# Interesting Aggregates Table: mean nutrients per label.
agg_table = filtered_df.groupby('is_baked_good')[['sugar (g)', 'protein (g)', 'calories (#)']].mean().round(2)
print(agg_table)
sugar (g) protein (g) calories (#) is_baked_good False 16.90 17.68 360.94 True 38.31 8.76 346.79
# Average character length of description, grouped by is_baked_good.
avg_lengths = filtered_df.groupby('is_baked_good')['description'].apply(lambda x: x.str.len().mean())
print("📏 Average description lengths:")
print(avg_lengths)
📏 Average description lengths: is_baked_good False 239.60 True 264.18 Name: description, dtype: float64
Step 3: Assessment of Missingness¶
# Count missing values per column to pick which missingness mechanisms to study.
missing_counts = recipes_cleaned_df.isna().sum()
missing_df = pd.DataFrame({
    'Missing Count': missing_counts,
})
# Filter to only show columns with missing values
missing_df = missing_df[missing_df['Missing Count'] > 0]
# Sort by most missing values
missing_df = missing_df.sort_values(by='Missing Count', ascending=False)
missing_df
# the top four missing columns are the rating, avg_rating, description, and review columns
# the avg_rating column would only be missing if that recipe did not have any ratings so its missingness is MD
# my theory on the missingness of the rating column: i think that the website has a comment function and our data includes both comments and actual ratings scraped from the website
# as such i think the missingness of the rating is NMAR and is a consequence of the data collection method (maybe the web scraping pulled both comments and reviews, or the website makes no distinction between them)
# the missingness of the review column could just be that someone decided to leave a rating and not write a review (NMAR)
# i think the missingness of the description column is quite interesting and odd (maybe could be MAR and depend on the steps column)
# for example, maybe the person writing the recipe decided to not put a description and just put the steps to the recipe
# n_steps and description (recipe that is easier has no description)
| Missing Count | |
|---|---|
| rating | 15036 |
| avg_rating | 2777 |
| description | 114 |
| ... | ... |
| protein (g) | 1 |
| saturated fat (g) | 1 |
| carbohydrates (g) | 1 |
14 rows × 1 columns
# Indicator for description missingness, used as the grouping variable in the permutation tests below.
recipes_cleaned_df['description_missing'] = recipes_cleaned_df['description'].isna()
def permutation_test(df, col_missing, col_test, n_permutations=1000):
    """Two-sided permutation test: does the mean of `col_test` differ between
    rows where `col_missing` is True vs False?

    Parameters
    ----------
    df : pd.DataFrame
    col_missing : str
        Boolean column marking missingness of the column under study.
    col_test : str
        Numeric column whose group means are compared; NaNs are dropped.
    n_permutations : int, optional
        Number of label shuffles used to build the null distribution.

    Returns
    -------
    (observed_diff, p_value) : tuple of float
        p-value is the share of shuffled |diffs| >= the observed |diff|.
    """
    # Fix: the original made a full df.copy() before dropna — dropna already
    # returns a new frame, so the extra copy was wasted work on 234k rows.
    subset = df.dropna(subset=[col_test])
    missing_mask = subset[col_missing].to_numpy()
    test_values = subset[col_test].to_numpy()
    observed_diff = test_values[missing_mask].mean() - test_values[~missing_mask].mean()
    diffs = np.empty(n_permutations)
    for i in range(n_permutations):
        shuffled_mask = np.random.permutation(missing_mask)
        diffs[i] = test_values[shuffled_mask].mean() - test_values[~shuffled_mask].mean()
    p_value = np.mean(np.abs(diffs) >= np.abs(observed_diff))
    return observed_diff, p_value
# Does description missingness depend on n_steps? (MAR check)
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'n_steps')
print(f'n_steps: Observed diff: {diff:.4f}, p-value: {p}')
n_steps: Observed diff: 0.7194, p-value: 0.243
# Does description missingness depend on n_ingredients? (MAR check)
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'n_ingredients')
print(f'n_ingredients: Observed diff: {diff:.4f}, p-value: {p}')
n_ingredients: Observed diff: -1.1335, p-value: 0.004
# Does description missingness depend on calories? (MAR check)
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'calories (#)')
print(f'calories: Observed diff: {diff:.4f}, p-value: {p}')
calories: Observed diff: -101.9820, p-value: 0.046
# Does description missingness depend on the recipe's average rating? (MAR check)
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'avg_rating')
print(f'average rating: Observed diff: {diff:.4f}, p-value: {p}')
average rating: Observed diff: -0.1903, p-value: 0.001
# Does description missingness depend on protein content? (MAR check)
diff, p = permutation_test(recipes_cleaned_df, 'description_missing', 'protein (g)')
print(f'protein: Observed diff: {diff:.4f}, p-value: {p}')
protein: Observed diff: 2.0542, p-value: 0.276
n_steps: No significant difference in number of steps between recipes with vs without a description. n_ingredients: Yes, statistically significant difference. Recipes without descriptions tend to use fewer ingredients on average. calories: Borderline significance (p ≈ 0.046, just under 0.05) — recipes without descriptions may be lower in calories, but the evidence is weak. avg_rating: Yes, statistically significant. Recipes without descriptions tend to have lower average ratings. protein: No statistical significance in protein content based on description missingness.
Step 4: Hypothesis Testing¶
def tag_based_permutation_test(df, tag_a, tag_b, score_col, n_permutations=1000):
    """Two-sided permutation test comparing mean `score_col` between rows
    tagged `tag_a` and rows tagged `tag_b`.

    The 'tags' column must hold lists; the frame is exploded so one row may
    contribute to both groups. Returns (nan, nan) if either group is empty.
    """
    exploded = df.explode('tags')
    scores_a = exploded.loc[exploded['tags'] == tag_a, score_col].dropna()
    scores_b = exploded.loc[exploded['tags'] == tag_b, score_col].dropna()
    if scores_a.empty or scores_b.empty:
        return np.nan, np.nan
    observed_diff = scores_a.mean() - scores_b.mean()
    pooled = np.concatenate([scores_a.to_numpy(), scores_b.to_numpy()])
    n_a = len(scores_a)
    null_diffs = []
    for _ in range(n_permutations):
        perm = np.random.permutation(pooled)
        null_diffs.append(perm[:n_a].mean() - perm[n_a:].mean())
    p_value = np.mean(np.abs(null_diffs) >= np.abs(observed_diff))
    return observed_diff, p_value
# What types of recipes tend to have the most calories? (based on tags)
# Dessert vs main dish
# Null hypothesis: Calories are around the same for desserts and main dishes.
# Alternative hypothesis: Calories for main dishes are less than calories for desserts.
# NOTE(review): the alternative is one-sided but the test computes a two-sided
# p-value (|diff|) — consider a directional statistic to match the hypothesis.
diff, p = tag_based_permutation_test(recipes_cleaned_df, 'desserts', 'main-dish', 'calories (#)')
print(f"Observed diff: {diff}, p-value: {p}")
Observed diff: 3.4238541343218003, p-value: 0.443
# What types of recipes tend to have higher average ratings? (based on tags)
# Dessert vs main dish
# Null hypothesis: Average ratings are around the same for desserts and main dishes.
# Alternative hypothesis: Main dishes are on average rated higher than desserts.
# NOTE(review): same one-sided-vs-two-sided mismatch as the calories test above.
diff, p = tag_based_permutation_test(recipes_cleaned_df, 'desserts', 'main-dish', 'avg_rating')
print(f"Observed diff: {diff}, p-value: {p:8f}")
Observed diff: -0.043594616861519775, p-value: 0.000000
def permutation_test_diff(df, group_col, group_a, group_b, value_col, n_permutations=1000):
    """Two-sided permutation test of mean `value_col` between rows where
    `group_col` equals `group_a` vs `group_b`.

    NaNs in `value_col` are dropped per group; returns (nan, nan) when
    either group is empty.
    """
    vals_a = df.loc[df[group_col] == group_a, value_col].dropna()
    vals_b = df.loc[df[group_col] == group_b, value_col].dropna()
    if vals_a.empty or vals_b.empty:
        return np.nan, np.nan
    observed_diff = vals_a.mean() - vals_b.mean()
    pooled = np.concatenate([vals_a.to_numpy(), vals_b.to_numpy()])
    split = len(vals_a)
    null_diffs = []
    for _ in range(n_permutations):
        perm = np.random.permutation(pooled)
        null_diffs.append(perm[:split].mean() - perm[split:].mean())
    p_value = np.mean(np.abs(null_diffs) >= np.abs(observed_diff))
    return observed_diff, p_value
# What types of recipes tend to be healthier (i.e. more protein, fewer carbs)?
# Are healthier recipes more highly rated recipes?
# Null hypothesis: Healthier recipes are on average rated the same as less healthier recipes.
# Alternative hypothesis: Healthier recipes are on average rated higher than less healthier recipes.
# NOTE(review): protein minus carbs is a crude health proxy — confirm this
# definition is acceptable before drawing conclusions from it.
recipes_cleaned_df['health_score'] = recipes_cleaned_df['protein (g)'] - recipes_cleaned_df['carbohydrates (g)']
median_health = recipes_cleaned_df['health_score'].median()
recipes_cleaned_df['high_health'] = recipes_cleaned_df['health_score'] >= median_health
observed_diff, p = permutation_test_diff(recipes_cleaned_df, 'high_health', True, False, 'avg_rating')
print(f"Observed diff: {observed_diff}, p-value: {p:8f}")
Observed diff: 0.02099521814306815, p-value: 0.000000
# What is the relationship between the cooking time and average rating of recipes?
# Null hypothesis: Recipes with shorter cook times are on average rated the same as recipes with longer cook times.
# Alternative hypothesis: Recipes with shorter cook times are on average rated better than recipes with longer cook times.
median_time = recipes_cleaned_df['minutes'].median()
recipes_cleaned_df['is_quick'] = recipes_cleaned_df['minutes'] < median_time
observed_diff, p = permutation_test_diff(recipes_cleaned_df, 'is_quick', True, False, 'avg_rating')
print(f"Observed diff: {observed_diff:.4f}, p-value: {p:.8f}")
Observed diff: 0.0342, p-value: 0.00000000
# Enumerate every distinct tag (used to choose the baked_keywords vocabulary above).
list(recipes_cleaned_df.explode('tags').groupby('tags').count().index)
['', '1-day-or-more', '15-minutes-or-less', '3-steps-or-less', '30-minutes-or-less', '4-hours-or-less', '5-ingredients-or-less', '60-minutes-or-less', 'Throw the ultimate fiesta with this sopaipillas recipe from Food.com.', 'a1-sauce', 'african', 'american', 'amish-mennonite', 'angolan', 'appetizers', 'apples', 'april-fools-day', 'argentine', 'artichoke', 'asian', 'asparagus', 'australian', 'austrian', 'avocado', 'bacon', 'baja', 'baked-beans', 'baking', 'bananas', 'bar-cookies', 'barbecue', 'bass', 'bean-soup', 'beans', 'beans-side-dishes', 'bear', 'beef', 'beef-barley-soup', 'beef-crock-pot', 'beef-kidney', 'beef-liver', 'beef-organ-meats', 'beef-ribs', 'beef-sauces', 'beef-sausage', 'beginner-cook', 'beijing', 'belgian', 'berries', 'beverages', 'birthday', 'biscotti', 'bisques-cream-soups', 'black-bean-soup', 'black-beans', 'blueberries', 'bok-choys', 'brazilian', 'bread-machine', 'bread-pudding', 'breads', 'breakfast', 'breakfast-casseroles', 'breakfast-eggs', 'breakfast-potatoes', 'brewing', 'british-columbian', 'broccoli', 'broil', 'brown-bag', 'brown-rice', 'brownies', 'brunch', 'burgers', 'cabbage', 'cajun', 'cake-fillings-and-frostings', 'cakes', 'californian', 'cambodian', 'camping', 'canadian', 'candy', 'canning', 'cantonese', 'caribbean', 'carrots', 'casseroles', 'catfish', 'cauliflower', 'celebrity', 'central-american', 'chard', 'cheese', 'cheesecake', 'cherries', 'chick-peas-garbanzos', 'chicken', 'chicken-breasts', 'chicken-crock-pot', 'chicken-livers', 'chicken-stew', 'chicken-stews', 'chicken-thighs-legs', 'chilean', 'chili', 'chinese', 'chinese-new-year', 'chocolate', 'chocolate-chip-cookies', 'chowders', 'christmas', 'chutneys', 'cinco-de-mayo', 'citrus', 'clams', 'clear-soups', 'cobblers-and-crisps', 'cocktails', 'coconut', 'cod', 'coffee-cakes', 'collard-greens', 'college', 'colombian', 'comfort-food', 'condiments-etc', 'congolese', 'cookies-and-brownies', 'cooking-mixes', 'copycat', 'corn', 'costa-rican', 'course', 'crab', 'cranberry-sauce', 
'crawfish', 'creole', 'crock-pot-main-dish', 'crock-pot-slow-cooker', 'crusts-pastry-dough-2', 'cuban', 'cuisine', 'cupcakes', 'curries', 'czech', 'dairy-free', 'danish', 'deep-fry', 'deer', 'dehydrator', 'desserts', 'desserts-easy', 'desserts-fruit', 'diabetic', 'dietary', 'dinner-party', 'dips', 'dips-lunch-snacks', 'drop-cookies', 'duck', 'duck-breasts', 'dutch', 'easter', 'easy', 'ecuadorean', 'egg-free', 'eggplant', 'eggs', 'eggs-breakfast', 'eggs-dairy', 'egyptian', 'elbow-macaroni', 'elk', 'english', 'equipment', 'ethiopian', 'european', 'fall', 'fathers-day', 'filipino', 'fillings-and-frostings-chocolate', 'finger-food', 'finnish', 'fish', 'flat-shapes', 'food-processor-blender', 'for-1-or-2', 'for-large-groups', 'for-large-groups-holiday-event', 'free-of-something', 'freezer', 'french', 'freshwater-fish', 'from-scratch', 'frozen-desserts', 'fruit', 'fudge', 'garnishes', 'gelatin', 'georgian', 'german', 'gifts', 'gluten-free', 'goose', 'grains', 'granola-and-porridge', 'grapes', 'greek', 'green-yellow-beans', 'greens', 'grilling', 'ground-beef', 'guatemalan', 'gumbo', 'halibut', 'halloween', 'halloween-cakes', 'halloween-cocktails', 'halloween-cupcakes', 'ham', 'ham-and-bean-soup', 'hand-formed-cookies', 'hanukkah', 'hawaiian', 'healthy', 'healthy-2', 'heirloom-historical', 'heirloom-historical-recipes', 'herb-and-spice-mixes', 'hidden-valley-ranch', 'high-calcium', 'high-fiber', 'high-in-something', 'high-in-something-diabetic-friendly', 'high-protein', 'holiday-event', 'honduran', 'hunan', 'hungarian', 'ice-cream', 'icelandic', 'independence-day', 'indian', 'indonesian', 'inexpensive', 'infant-baby-friendly', 'iranian-persian', 'iraqi', 'irish', 'irish-st-patricks-day', 'italian', 'jams-and-preserves', 'japanese', 'jellies', 'jewish-ashkenazi', 'jewish-sephardi', 'kid-friendly', 'kiwifruit', 'korean', 'kosher', 'kwanzaa', 'labor-day', 'lactose', 'lamb-sheep', 'lamb-sheep-main-dish', 'laotian', 'lasagna', 'lasagne', 'lebanese', 'leftovers', 'lemon', 
'lentils', 'less_thansql:name_topics_of_recipegreater_than', 'lettuces', 'libyan', 'lime', 'lobster', 'long-grain-rice', 'low-calorie', 'low-carb', 'low-cholesterol', 'low-fat', 'low-in-something', 'low-protein', 'low-saturated-fat', 'low-sodium', 'lunch', 'macaroni-and-cheese', 'mahi-mahi', 'main-dish', 'main-dish-beef', 'main-dish-chicken', 'main-dish-pasta', 'main-dish-pork', 'main-dish-seafood', 'main-ingredient', 'malaysian', 'mango', 'manicotti', 'mardi-gras-carnival', 'marinades-and-rubs', 'marinara-sauce', 'mashed-potatoes', 'meat', 'meatballs', 'meatloaf', 'medium-grain-rice', 'melons', 'memorial-day', 'mexican', 'micro-melanesia', 'microwave', 'middle-eastern', 'middle-eastern-main-dish', 'midwestern', 'mixer', 'mongolian', 'moose', 'moroccan', 'mothers-day', 'muffins', 'mushroom-soup', 'mushrooms', 'mussels', 'namibian', 'native-american', 'nepalese', 'new-years', 'new-zealand', 'nigerian', 'no-cook', 'no-shell-fish', 'non-alcoholic', 'north-american', 'northeastern-united-states', 'norwegian', 'novelty', 'number-of-servings', 'nut-free', 'nuts', 'oamc-freezer-make-ahead', 'oatmeal', 'oaxacan', 'occasion', 'octopus', 'omelets-and-frittatas', 'one-dish-meal', 'onions', 'ontario', 'orange-roughy', 'oranges', 'oven', 'oysters', 'pacific-northwest', 'pakistani', 'palestinian', 'pancakes-and-waffles', 'papaya', 'passover', 'pasta', 'pasta-elbow-macaroni', 'pasta-rice-and-grains', 'pasta-salad', 'pasta-shells', 'peaches', 'peanut-butter', 'pears', 'penne', 'pennsylvania-dutch', 'peppers', 'perch', 'peruvian', 'pheasant', 'pickeral', 'picnic', 'pies', 'pies-and-tarts', 'pineapple', 'pitted-fruit', 'pizza', 'plums', 'polish', 'polynesian', 'pork', 'pork-chops', 'pork-crock-pot', 'pork-loin', 'pork-loins', 'pork-loins-roast', 'pork-ribs', 'pork-sausage', 'portuguese', 'pot-pie', 'pot-roast', 'potatoes', 'potluck', 'poultry', 'preparation', 'prepared-potatoes', 'presentation', 'pressure-canning', 'pressure-cooker', 'puddings-and-mousses', 'puerto-rican', 
'pumpkin', 'pumpkin-bread', 'punch', 'quebec', 'quiche', 'quick-breads', 'rabbit', 'ragu-recipe-contest', 'ramadan', 'raspberries', 'ravioli-tortellini', 'refrigerator', 'reynolds-wrap', 'rice', 'roast', 'roast-beef', 'roast-beef-comfort-food', 'roast-beef-main-dish', 'rolled-cookies', 'rolls-biscuits', 'romantic', 'rosh-hashana', 'rosh-hashanah', 'russian', 'salad-dressings', 'salads', 'salmon', 'salsas', 'saltwater-fish', 'sandwiches', 'sauces', 'saudi-arabian', 'savory', 'savory-pies', 'savory-sauces', 'scallops', 'scandinavian', 'scones', 'scottish', 'seafood', 'seasonal', 'served-cold', 'served-hot', 'served-hot-new-years', 'shakes', 'shellfish', 'short-grain-rice', 'shrimp', 'shrimp-main-dish', 'side-dishes', 'side-dishes-beans', 'simply-potatoes', 'simply-potatoes2', 'small-appliance', 'smoker', 'smoothies', 'snacks', 'snacks-kid-friendly', 'snacks-sweet', 'sole-and-flounder', 'somalian', 'soul', 'soups-stews', 'sourdough', 'south-african', 'south-american', 'south-west-pacific', 'southern-united-states', 'southwestern-united-states', 'soy-tofu', 'spaghetti', 'spaghetti-sauce', 'spanish', 'spicy', 'spinach', 'spreads', 'spring', 'squash', 'squid', 'st-patricks-day', 'steak', 'steaks', 'steam', 'stews', 'stews-poultry', 'stir-fry', 'stocks', 'stove-top', 'strawberries', 'stuffings-dressings', 'sudanese', 'sugar-cookies', 'summer', 'super-bowl', 'superbowl', 'swedish', 'sweet', 'sweet-sauces', 'swiss', 'szechuan', 'tarts', 'taste-mood', 'technique', 'tempeh', 'tex-mex', 'thai', 'thanksgiving', 'tilapia', 'time-to-make', 'to-go', 'toddler-friendly', 'tomatoes', 'tropical-fruit', 'trout', 'tuna', 'turkey', 'turkey-breasts', 'turkey-burgers', 'turkish', 'unprocessed-freezer', 'valentines-day', 'veal', 'vegan', 'vegetables', 'vegetarian', 'veggie-burgers', 'venezuelan', 'very-low-carbs', 'vietnamese', 'water-bath', 'wedding', 'weeknight', 'welsh', 'white-rice', 'whitefish', 'whole-chicken', 'whole-duck', 'whole-turkey', 'wild-game', 'wings', 'winter', 
'yams-sweet-potatoes', 'yeast', 'zucchini']
# Step 4: Hypothesis Testing — Do baked goods have significantly more sugar than non-baked goods?
baked = filtered_df[filtered_df['is_baked_good'] == True]['sugar (g)']
non_baked = filtered_df[filtered_df['is_baked_good'] == False]['sugar (g)']
observed_stat = baked.mean() - non_baked.mean()
# Setup for permutation test
sugar = filtered_df['sugar (g)'].values
labels = filtered_df['is_baked_good'].values
n_reps = 5000
perm_stats = []
for _ in range(n_reps):
    shuffled_labels = np.random.permutation(labels)
    group1 = sugar[shuffled_labels == True]
    group2 = sugar[shuffled_labels == False]
    stat = group1.mean() - group2.mean()
    perm_stats.append(stat)
perm_stats = np.array(perm_stats)
# One-sided p-value, matching the directional alternative (baked > non-baked).
p_value = np.mean(perm_stats >= observed_stat)
print(f"Observed Statistic: {observed_stat:.4f}")
print(f"p-value: {p_value:.4f}")
Observed Statistic: 21.4139 p-value: 0.0000
# Step 4: Hypothesis Testing — Do non-baked goods have significantly more protein than baked goods?
baked = filtered_df[filtered_df['is_baked_good'] == True]['protein (g)']
non_baked = filtered_df[filtered_df['is_baked_good'] == False]['protein (g)']
observed_stat = non_baked.mean() - baked.mean()
# Setup for permutation test
protein = filtered_df['protein (g)'].values
labels = filtered_df['is_baked_good'].values
n_reps = 5000
perm_stats = []
for _ in range(n_reps):
    shuffled_labels = np.random.permutation(labels)
    group1 = protein[shuffled_labels == True]
    group2 = protein[shuffled_labels == False]
    # Note the reversed direction vs the sugar test: non-baked minus baked.
    stat = group2.mean() - group1.mean()
    perm_stats.append(stat)
perm_stats = np.array(perm_stats)
# One-sided p-value, matching the directional alternative (non-baked > baked).
p_value = np.mean(perm_stats >= observed_stat)
print(f"Observed Statistic: {observed_stat:.4f}")
print(f"p-value: {p_value:.4f}")
Observed Statistic: 8.9204 p-value: 0.0000
Step 5: Framing a Prediction Problem¶
# We will be doing a classification problem on predicting the is_baked_good column.
# It is binary classification considering that we only have True or False outputs.
filtered_df.columns
Index(['name', 'id', 'minutes', 'contributor_id', 'tags', 'n_steps', 'steps',
'description', 'ingredients', 'n_ingredients', 'user_id', 'recipe_id',
'rating', 'review', 'avg_rating', 'calories (#)', 'total fat (PDV)',
'sugar (PDV)', 'sodium (PDV)', 'protein (PDV)', 'saturated fat (PDV)',
'carbohydrates (PDV)', 'date_review_submitted', 'date_recipe_posted',
'total fat (g)', 'sugar (g)', 'sodium (mg)', 'protein (g)',
'saturated fat (g)', 'carbohydrates (g)', 'year', 'is_baked_good'],
dtype='object')
Step 6: Baseline Model¶
# Baseline model: logistic regression on the sugar and protein columns only.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Features and label
X = filtered_df[['sugar (g)', 'protein (g)']].copy()
y = filtered_df['is_baked_good']
# Baseline pipeline: raw (unscaled) features straight into logistic regression.
pipeline = Pipeline(steps=[
    ('classifier', LogisticRegression())
])
# Fix: pin random_state so the split — and therefore the reported metrics —
# are reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
False 0.76 0.96 0.84 40566
True 0.66 0.22 0.33 16024
accuracy 0.75 56590
macro avg 0.71 0.59 0.59 56590
weighted avg 0.73 0.75 0.70 56590
Step 7: Final Model¶
# Final model, step 1: add a third feature to the logistic regression.
# We will use the sugar, protein, AND n_steps columns.
# NOTE(review): the original comment said "SODIUM", but the feature
# actually added is n_steps.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']
# Preprocessing: standardize so logistic-regression coefficients are on
# comparable scales and the solver converges faster.
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
# ColumnTransformer for numeric preprocessing
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)
# Full pipeline with preprocessing and logistic regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
# Stratified, reproducible train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
False 0.78 0.95 0.85 40566
True 0.70 0.31 0.43 16024
accuracy 0.77 56590
macro avg 0.74 0.63 0.64 56590
weighted avg 0.76 0.77 0.73 56590
# Final model, step 2: same three features (sugar, protein, n_steps),
# now with balanced class weights to counter the class imbalance.
# NOTE(review): the original comment said "SODIUM", but the third
# feature actually used is n_steps.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']
# Preprocessing for numeric features
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
# ColumnTransformer for numeric preprocessing
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)
# class_weight='balanced' reweights samples inversely to class frequency,
# trading precision for recall on the minority (True) class.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced'))
])
# Stratified, reproducible train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
False 0.86 0.74 0.80 40566
True 0.52 0.70 0.60 16024
accuracy 0.73 56590
macro avg 0.69 0.72 0.70 56590
weighted avg 0.77 0.73 0.74 56590
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
# Features and label, as NumPy arrays so KFold index slicing is positional
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy().to_numpy()
y = filtered_df['is_baked_good'].to_numpy()
# Preprocessing and model
numeric_features = [0, 1, 2]  # column positions, since X is a NumPy array
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced'))
])
# K-Fold setup; random_state makes the fold assignment reproducible
# (the original shuffled without a seed).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Storage for metrics
accuracies, precisions, recalls, f1s = [], [], [], []
# Run k-fold CV: fit on each training fold, score on the held-out fold
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    f1s.append(f1_score(y_val, y_pred))
# Report mean results
print(f"Accuracy: {np.mean(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f}")
print(f"F1 Score: {np.mean(f1s):.4f}")
Accuracy: 0.7297 Precision: 0.5166 Recall: 0.7065 F1 Score: 0.5968
# Decision tree classifier on the same three features
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']
# Preprocessing — NOTE(review): scaling is a no-op for tree splits, but it
# is kept so this pipeline stays structurally comparable to the others.
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])
# Pipeline with Decision Tree; random_state fixes tie-breaking among
# equally good splits so the fit is reproducible.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])
# Stratified, reproducible train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
False 0.92 0.95 0.94 40566
True 0.86 0.80 0.83 16024
accuracy 0.91 56590
macro avg 0.89 0.87 0.88 56590
weighted avg 0.90 0.91 0.90 56590
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
# Cross-validate the decision-tree pipeline from the previous cell.
# X and y are still the pandas DataFrame/Series, hence the .iloc indexing.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1s = [], [], [], []
for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Accumulate each fold's metrics in one pass.
    for bucket, metric in (
        (accuracies, accuracy_score),
        (precisions, precision_score),
        (recalls, recall_score),
        (f1s, f1_score),
    ):
        bucket.append(metric(y_test, y_pred))
# Display average results
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average F1 Score: {np.mean(f1s):.4f}")
Average Accuracy: 0.9082 Average Precision: 0.8587 Average Recall: 0.8088 Average F1 Score: 0.8330
# Regularized decision tree: cap the depth and balance class weights.
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Features and label
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']].copy()
y = filtered_df['is_baked_good']
# Standardize the three numeric inputs inside a ColumnTransformer.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['sugar (g)', 'protein (g)', 'n_steps'])
])
# Shallow tree (max_depth=10) with class weighting for the imbalance.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=10, class_weight='balanced'))
])
# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
False 0.88 0.82 0.85 40566
True 0.61 0.73 0.66 16024
accuracy 0.79 56590
macro avg 0.75 0.77 0.76 56590
weighted avg 0.81 0.79 0.80 56590
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
# Cross-validate the regularized decision-tree pipeline from the previous
# cell; X and y are the pandas DataFrame/Series defined there, so .iloc
# indexing is used. random_state added for reproducibility, matching the
# earlier KFold cell (which seeded with 42).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1s = [], [], [], []
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
# Display average results
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average F1 Score: {np.mean(f1s):.4f}")
Average Accuracy: 0.7919 Average Precision: 0.6100 Average Recall: 0.7355 Average F1 Score: 0.6669
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Features and target
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']]
y = filtered_df['is_baked_good']
# Scale the three numeric features, then fit a decision tree.
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features)])
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('classifier', DecisionTreeClassifier())])
# Hyperparameter search space for the tree.
param_grid = {
    'classifier__max_depth': [4, 6, 8, 10],
    'classifier__min_samples_leaf': [5, 10, 20],
    'classifier__min_samples_split': [10, 20, 40],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__class_weight': ['balanced'],
}
# Exhaustive search with 5-fold cross-validation, selecting on F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose=1)
grid_search.fit(X, y)
# Output best results
print("Best parameters:")
print(grid_search.best_params_)
print("\nBest F1 score:")
print(grid_search.best_score_)
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters:
{'classifier__class_weight': 'balanced', 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 20, 'classifier__min_samples_split': 10}
Best F1 score:
0.6075506522398966
# Random forest on the same three features
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Features and target
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']]
y = filtered_df['is_baked_good']
# Preprocessing — scaling is irrelevant to tree splits but kept for
# symmetry with the other pipelines.
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])
# Random Forest pipeline; random_state makes the bootstrap sampling and
# feature selection reproducible.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        bootstrap=True,
        class_weight='balanced',
        random_state=42))
])
# Stratified, reproducible train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
False 0.89 0.85 0.87 40566
True 0.65 0.72 0.69 16024
accuracy 0.81 56590
macro avg 0.77 0.79 0.78 56590
weighted avg 0.82 0.81 0.82 56590
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Features and target
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps']]
y = filtered_df['is_baked_good']
# Preprocessing
numeric_features = ['sugar (g)', 'protein (g)', 'n_steps']
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features)
])
# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced'))
])
# Grid of hyperparameters.
# NOTE(review): class_weight has a single value and is already set on the
# estimator; it is kept in the grid only so it appears in best_params_.
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 6, 10],
    'classifier__bootstrap': [True, False],
    'classifier__class_weight': ['balanced']
}
# Grid search with 3-fold cross-validation (cv=3 — the original comment
# incorrectly said 5-fold), selecting on F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=2,
    verbose=2
)
# Fit the grid search
grid_search.fit(X, y)
# Output best results
print("✅ Best parameters:")
print(grid_search.best_params_)
print("\n📈 Best F1 score:")
print(grid_search.best_score_)
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 0.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 0.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 0.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 3.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 3.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 3.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 6.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 6.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 0.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 0.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 0.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 6.4s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 2.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 2.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 0.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 0.4s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 0.4s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 2.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 2.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 4.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 1.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 0.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 4.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 0.9s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 4.3s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 4.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 4.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 8.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 8.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 0.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 0.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 0.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 1.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 1.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 8.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 1.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 3.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 3.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 0.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 0.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 0.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 3.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 2.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 2.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 2.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 5.2s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 5.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 5.2s
✅ Best parameters:
{'classifier__bootstrap': True, 'classifier__class_weight': 'balanced', 'classifier__max_depth': 10, 'classifier__n_estimators': 100}
📈 Best F1 score:
0.6226984077766446
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Cross-validate the random-forest pipeline from the earlier cell; X and y
# are still the pandas DataFrame/Series, so positional .iloc indexing works.
kf = KFold(n_splits=5, shuffle=True)
accuracies, precisions, recalls, f1s = [], [], [], []
for train_idx, test_idx in kf.split(X):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Accumulate this fold's metrics.
    for bucket, metric in (
        (accuracies, accuracy_score),
        (precisions, precision_score),
        (recalls, recall_score),
        (f1s, f1_score),
    ):
        bucket.append(metric(y_test, y_pred))
# Display average results
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average F1 Score: {np.mean(f1s):.4f}")
Average Accuracy: 0.9067 Average Precision: 0.8195 Average Recall: 0.8599 Average F1 Score: 0.8392
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# --- Feature engineering functions ---
def add_calories_per_ingredient(X):
    """Ratio feature: calories divided by ingredient count.

    X arrives as a 2-column array [calories, n_ingredients] because the
    FunctionTransformer below uses validate=True (coerces to NumPy).
    Guards against a zero ingredient count, which would otherwise emit
    inf/NaN and break the downstream estimator.
    """
    calories = X[:, 0].astype(float)
    n_ingredients = X[:, 1].astype(float)
    ratio = np.divide(calories, n_ingredients,
                      out=np.zeros_like(calories),
                      where=n_ingredients != 0)
    return ratio.reshape(-1, 1)
def add_description_word_count(X):
    """Length feature: number of words in the recipe description.

    X is a 1-column DataFrame (validate=False keeps it as pandas);
    missing/non-string descriptions count as 0 words.
    """
    return X.iloc[:, 0].apply(lambda desc: len(desc.split()) if isinstance(desc, str) else 0).to_numpy().reshape(-1, 1)
# --- Define features and target ---
X = filtered_df[['sugar (g)', 'protein (g)', 'n_steps', 'calories (#)', 'n_ingredients', 'description']]
y = filtered_df['is_baked_good']
# --- Unified ColumnTransformer with all transformations ---
preprocessor = ColumnTransformer(transformers=[
    ('scaled_numeric', StandardScaler(), ['sugar (g)', 'protein (g)', 'n_steps']),
    ('cal_per_ing', FunctionTransformer(add_calories_per_ingredient, validate=True), ['calories (#)', 'n_ingredients']),
    ('desc_len', FunctionTransformer(add_description_word_count, validate=False), ['description'])
])
# --- Final pipeline (random_state for reproducibility) ---
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        bootstrap=True,
        class_weight='balanced',
        random_state=42
    ))
])
# --- Stratified, reproducible train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
# --- Fit and evaluate ---
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support
False 0.89 0.86 0.88 40566
True 0.67 0.74 0.71 16024
accuracy 0.82 56590
macro avg 0.78 0.80 0.79 56590
weighted avg 0.83 0.82 0.83 56590
# Hyperparameter grid for the feature-engineered random-forest pipeline
# defined in the previous cell.
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 6, 10],
    'classifier__bootstrap': [True, False],
    'classifier__class_weight': ['balanced']
}
# 5-fold cross-validated grid search, selecting on F1.
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1',
                           n_jobs=3,
                           verbose=2)
# Fit the grid search
grid_search.fit(X, y)
# Output best results
print("✅ Best parameters:")
print(grid_search.best_params_)
print("\n📈 Best F1 score:")
print(grid_search.best_score_)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 2.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 2.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 2.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 9.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 9.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 9.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 9.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 9.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 17.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 17.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 17.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.3s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 17.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 4.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 4.2s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 18.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 4.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 7.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 7.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 7.7s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 7.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 7.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 1.6s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 5.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 5.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 5.8s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 5.9s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 6.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 11.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 11.5s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 3.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 3.1s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 3.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 3.0s
[CV] END classifier__bootstrap=True, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=10; total time= 3.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 13.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 13.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 13.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 13.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=50; total time= 13.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 26.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 27.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 26.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=10; total time= 1.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 5.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 26.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 5.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 5.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 5.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=None, classifier__n_estimators=100; total time= 27.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=50; total time= 5.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 11.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 11.3s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 11.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 2.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 11.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=10; total time= 2.1s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=6, classifier__n_estimators=100; total time= 11.0s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 8.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 8.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 8.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 8.4s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=50; total time= 8.5s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 16.6s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 16.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 16.7s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 15.8s
[CV] END classifier__bootstrap=False, classifier__class_weight=balanced, classifier__max_depth=10, classifier__n_estimators=100; total time= 14.4s
✅ Best parameters:
{'classifier__bootstrap': True, 'classifier__class_weight': 'balanced', 'classifier__max_depth': 10, 'classifier__n_estimators': 100}
📈 Best F1 score:
0.6357663756207149
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Evaluate `pipeline` with 5-fold cross-validation, averaging four metrics.
# Assumes X is a pandas DataFrame and y a pandas Series (indexed via .iloc),
# defined in earlier cells along with the fitted-to-be `pipeline` object.
# random_state pins the shuffle so the reported averages are reproducible
# across notebook runs (the original KFold was unseeded).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies, precisions, recalls, f1s = [], [], [], []
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Re-fit from scratch on each fold's training split so no information
    # from the held-out fold leaks into preprocessing or the model.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # precision/recall/f1 use sklearn's default binary averaging
    # (positive class = 1), matching the binary target built earlier.
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

# Report the mean of each metric across the 5 folds.
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average F1 Score: {np.mean(f1s):.4f}")
Average Accuracy: 0.8243 Average Precision: 0.6719 Average Recall: 0.7418 Average F1 Score: 0.7051
Step 8: Fairness Analysis¶
# TODO